#remove scientific notation
options(scipen=999)

—- Load packages —-

library(stringr)
library(corrplot)
## corrplot 0.84 loaded

—- Load neighborhood data —-

load("Data/county_factors.rda")
load("Data/county_500CitiesData.rda")

—- Load and format covid data —-

data.path <- "Data/COVID-19/csse_covid_19_data/csse_covid_19_time_series/"

# Read in the data
US.deaths <- read.csv(
  paste0(data.path, "time_series_covid19_deaths_US.csv"), 
  header = T, stringsAsFactors = F)
US.cases <- read.csv(
  paste0(data.path, "time_series_covid19_confirmed_US.csv"), 
  header = T, stringsAsFactors = F)

# Read in the header seprately.
US.cases.head <- read.csv(
  paste0(data.path, "time_series_covid19_confirmed_US.csv"), 
  header = F, stringsAsFactors = F)[1,]
US.deaths.head <- read.csv(
  paste0(data.path, "time_series_covid19_deaths_US.csv"), 
  header = F, stringsAsFactors = F)[1,]

# Correct the dates in the header to be more useable as
# column names.
proper_date <- function(dates){
  dates <- sapply(dates, strsplit, split = "/")
  dates <- lapply(dates, str_pad, width = 2, side = "left", pad = "0")
  dates <- lapply(dates, paste, collapse = "_")
  dates <- unlist(dates)
  
  return(dates)
}

dates.cases <- proper_date(US.cases.head[-c(1:11)])
dates.deaths <- proper_date(US.deaths.head[-c(1:12)])

names(US.cases) <- c(US.cases.head[1,1:11], dates.cases)
names(US.deaths) <- c(US.deaths.head[1,1:12], dates.deaths)

if(sum(US.cases$UID != US.deaths$UID) > 0){warning("COVID data rows do not match!")}
US.cases$Population <- US.deaths$Population
US.cases <- US.cases[,c(1:11, ncol(US.cases), 12:(ncol(US.cases)-1))]

Split the dataset into the data and the info for usability.

US.cases.info <- as.matrix(US.cases[,1:12])
US.cases.data <- as.matrix(US.cases[,-c(2:12)])
US.deaths.info <- as.matrix(US.deaths[,1:12])
US.deaths.data <- as.matrix(US.deaths[,-c(2:12)])

rownames(US.cases.info) <- US.cases.info[,1]
US.cases.info <- US.cases.info[,-1]
rownames(US.cases.data) <- US.cases.data[,1]
US.cases.data <- US.cases.data[,-1]
rownames(US.deaths.info) <- US.deaths.info[,1]
US.deaths.info <- US.deaths.info[,-1]
rownames(US.deaths.data) <- US.deaths.data[,1]
US.deaths.data <- US.deaths.data[,-1]


ndays.cases <- ncol(US.cases.data)
ndays.deaths <- ncol(US.deaths.data)

nobs <- nrow(US.cases.data)

—- The Curve —-

state.curve <- function(state, stat = c("cases", "deaths")){
  if(stat == "cases"){
    data <- US.cases.data[which(US.cases$Province_State == state),]
  }else if(stat == "deaths"){
    data <- US.deaths.data[which(US.deaths$Province_State == state),]
  }
  data.sum <- colSums(data)
  day.first.case <- min(which(data.sum > 0))
  n.days <- length(data.sum)
  barplot(data.sum[day.first.case:n.days], 
          main = paste0("Total COVID-19 ", stat," by date in ", state), 
          log = "y", las = 2, cex.axis = 1, cex.names = 0.5)
}

state.curve("New York", "cases")

state.curve("New York", "deaths")

state.curve("California", "cases")

state.curve("California", "deaths")

state.curve("Oklahoma", "cases")

state.curve("Oklahoma", "deaths")

county.curve <- function(county, stat = c("cases", "deaths")){
  if(stat == "cases"){
    data <- US.cases.data[which(US.cases$Admin2 == county),]
  }else if(stat == "deaths"){
    data <- US.deaths.data[which(US.deaths$Admin2 == county),]
  }
  
  day.first.case <- min(which(data > 0))
  n.days <- length(data)
  
barplot(data[day.first.case:n.days], main = paste0("Total COVID-19 ", stat," by date in ", county), log = "y", las = 2, cex.axis = 1, cex.names = 0.5)
  
}

county.curve("Tulsa", "cases")

county.curve("Tulsa", "deaths")

—- Calculate some useful stats to compare with neighborhood data —-

US.stats <- data.frame(UID = US.cases$UID)
cases.total <- colSums(US.cases.data)

day.first.case <- min(which(cases.total > 100))
n.days <- length(cases.total)

par(mar = c(5,5,4,2))
barplot(cases.total[day.first.case:n.days], main = "Total COVID-19 cases by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)

barplot(cases.total[day.first.case:n.days], main = "Total COVID-19 cases by Date in US, log scale", las = 2, cex.axis = 1, cex.names = 0.5, log = "y")

deaths.total <- colSums(US.deaths.data)

day.first.case <- min(which(deaths.total > 0))
n.days <- length(deaths.total)

barplot(deaths.total[day.first.case:n.days], main = "Total COVID-19 deaths by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)

barplot(deaths.total[day.first.case:n.days], main = "Total COVID-19 deaths by Date in US, log scale", las = 2, cex.axis = 1, cex.names = 0.5, log = "y")

Average rise in cases per day

avg.rise.cases

rise.cases <- matrix(ncol = ndays.cases - 1, nrow = nobs)
colnames(rise.cases) <- colnames(US.cases.data)[-1]
for(i in 1:ncol(rise.cases) + 1){
  rise <- US.cases.data[,i] - US.cases.data[,i-1]
  rise.cases[,i-1] <- rise
}

US.stats$avg.rise.cases <- apply(rise.cases, 1, mean)

rise.cases.total <- colSums(rise.cases)

day.first.case <- min(which(rise.cases.total > 0))
n.days <- length(rise.cases.total)

barplot(rise.cases.total[day.first.case:n.days], main = "Rise in Cases of COVID-19 by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)

Average rise in deaths per day

avg.rise.deaths

rise.deaths <- matrix(ncol = ndays.deaths - 1, nrow = nobs)
colnames(rise.deaths) <- colnames(US.deaths.data)[-1]
for(i in 1:ncol(rise.deaths) + 1){
  rise <- US.deaths.data[,i] - US.deaths.data[,i-1]
  rise.deaths[,i-1] <- rise
}

US.stats$avg.rise.deaths <- apply(rise.deaths, 1, mean)

rise.deaths.total <- colSums(rise.deaths)

day.first.case <- min(which(rise.deaths.total > 0))
n.days <- length(rise.deaths.total)

barplot(rise.deaths.total[day.first.case:n.days], main = "Rise in Deaths of COVID-19 by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)

Total cases

total.cases

US.stats$total.cases <- US.cases.data[,ndays.cases]

Total cases per capita

US.stats$total.cases.percap <- US.stats$total.cases / US.cases$Population
US.stats$total.cases.percap[US.cases$Population == 0] <- NA
hist(US.stats$total.cases.percap)

Total deaths

total.deaths

US.stats$total.deaths <- US.deaths.data[,ndays.deaths]

Total deaths per capita

total.deaths.percap

US.stats$total.deaths.percap <- US.stats$total.deaths / US.deaths$Population
US.stats$total.deaths.percap[US.deaths$Population == 0] <- NA

max(US.stats$total.deaths.percap,na.rm = T)
## [1] 0.002516538

Total deaths per case

total.deaths.percase Error in Johns Hopkins data has rows with total.deaths > total.cases.

# pos.case.ind <- US.stats$total.cases > 0
# US.stats$total.deaths.percase[pos.case.ind] <- US.stats$total.deaths[pos.case.ind] / US.stats$total.cases[pos.case.ind]
# US.stats$total.deaths.percase[!pos.case.ind] <- 0
US.stats$total.deaths.percase <- US.stats$total.deaths / US.stats$total.cases
US.stats$total.deaths.percase[US.stats$total.cases == 0] <- NA

US.stats[which(US.stats$total.deaths > US.stats$total.cases),]
##           UID avg.rise.cases avg.rise.deaths total.cases
## 3203 84090002     0.00000000      0.04494382           0
## 3204 84090004     0.00000000      0.01123596           0
## 3206 84090006     0.00000000      0.02247191           0
## 3222 84090024     0.00000000      1.03370787           0
## 3231 84090033     0.04494382      0.43820225           4
## 3234 84090036     0.00000000      0.53932584           0
## 3248 84090051     0.00000000      1.30337079           0
## 3250 84090054     0.00000000      0.19101124           0
## 3252 84090056     0.00000000      0.01123596           0
##      total.cases.percap total.deaths total.deaths.percap
## 3203                 NA            4                  NA
## 3204                 NA            1                  NA
## 3206                 NA            2                  NA
## 3222                 NA           92                  NA
## 3231                 NA           39                  NA
## 3234                 NA           48                  NA
## 3248                 NA          116                  NA
## 3250                 NA           17                  NA
## 3252                 NA            1                  NA
##      total.deaths.percase
## 3203                   NA
## 3204                   NA
## 3206                   NA
## 3222                   NA
## 3231                 9.75
## 3234                   NA
## 3248                   NA
## 3250                   NA
## 3252                   NA

—- Merge COVID data with Neighborhood data —-

US.stats$ID <- str_pad(US.stats$UID, 8, "left", pad = "0")
US.stats$ID <- substr(US.stats$ID, 4, 8)

data.merge <- merge(US.stats, county_factors, by = "ID")

—- Plot the relationships —-

data.cor <- cor(data.merge[,-c(1:2)], use = "complete.obs", method = "spearman")
corrplot.mixed(data.cor, upper = 'ellipse', lower = 'number', tl.pos = 'lt', tl.cex = 1, lower.col = "black", number.cex = 0.5)

data.merge2 <- merge(data.merge, county_500CitiesData, by = "ID", all.x = F)

—- Plot the relationships —-

data.cor2 <- cor(data.merge2[,-c(1:2)], use = "complete.obs", method = "spearman")
corrplot.mixed(data.cor2, upper = 'ellipse', lower = 'number', tl.pos = 'lt', tl.cex = 1, lower.col = "black", number.cex = 0.5)

corrplot.mixed(data.cor2[1:7,8:42], upper = 'ellipse', lower = 'number', tl.pos = 'lt', tl.cex = 0.5, lower.col = "black", number.cex = 0.25)

corrplot.mixed(data.cor2[1:7,8:42], upper = 'ellipse', lower = 'number', tl.pos = 'lt', tl.cex = 1, lower.col = "black", number.cex = 0.5)